The file "concrete.csv" contains important material data used in civil engineering. The concrete compressive strength — 'strength' — is the target label to be analysed.
Goal
Using the data available in file concrete.csv, apply feature engineering methods to obtain an accuracy of 85% to 95% (within the tolerance limit) 95% of the time (95% confidence level).
Resources Available
The data for this project is available in file https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/
Attributes details:
Independent variables:
Target:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
import scipy.stats as stat
from scipy.stats import zscore
from scipy.stats import randint as sp_randint
import statsmodels.api as sm
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor,BaggingRegressor)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm
from time import time
sns.set() # Setting Seaborn default styles
# Load the concrete compressive-strength dataset from the working directory.
concrete_df = pd.read_csv('concrete.csv')
# Eyeball the first and last 10 rows to sanity-check the load.
concrete_df.head(10)
concrete_df.tail(10)
df = concrete_df.copy() # Take the backup of original copy and keep working on this copy
Observations:
This section involves finding,
# High-level structural inspection of the working dataframe.
divider = '************************************************************************************\n'
print ('Information of all columns within the dataframe: ')
print (df.info())
print (divider)
print('Shape of the attributes: ')
print (df.shape)
print (divider)
print('types of the attributes: ')
print (df.dtypes)
print (divider)
# Unique-value count per column: a tiny count would suggest a categorical attribute.
df.apply(lambda s: s.nunique(dropna=False))
# Checking Missing values.
# Fix: the original decided "no missing values" by looking only at the 'age'
# entry of a per-column count, which is wrong whenever 'age' itself is NaN in
# the offending rows; test for the presence of any NaN row directly.
missing_rows = df[df.isna().any(axis=1)]
if missing_rows.empty:
    print ('No missing values')
else:
    print ('There are missing values')
    print (missing_rows.count())
# Verify which of the columns has the -Ve values
for col in df.columns.tolist():
    # Single boolean reduction instead of the original's two redundant scans.
    negatives = (df[col] < 0).sum()
    if negatives == 0:
        print ('No -Ve values in attribute: ' + col)
    else:
        print ('{0} attribute has {1} -ve values'.format(col, str(negatives)))
Observations:
df.describe().T
Observations:
Let's strengthen the above observations by using visualization techniques.
# Function is to find the skewness of Quantitative attributes
def findSkewness(quant_value, col):
    """Print whether *col* is left-skewed, right-skewed or symmetric.

    quant_value is a Series indexed by the quantile levels .25/.50/.75;
    the longer half of the inter-quartile spread decides the direction.
    """
    lower_spread = quant_value.loc[.50] - quant_value.loc[.25]
    upper_spread = quant_value.loc[.75] - quant_value.loc[.50]
    if lower_spread > upper_spread:
        print('\'' + col + '\' is left-skewed')
    elif lower_spread < upper_spread:
        print('\'' + col + '\' is right-skewed')
    else:
        print('\'' + col + '\' Attribute data distribution is symmetric')
# Finding the outliers
# IQR = Q3 - Q1
# Outlier if below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
def findOutliers(quant_value, col):
    """Print the IQR-rule outlier count of *col* in the global concrete_df."""
    q1, q3 = quant_value[.25], quant_value[.75]
    spread = q3 - q1
    lower_fence = q1 - 1.5 * spread
    upper_fence = q3 + 1.5 * spread
    beyond_fences = (concrete_df[col] <= lower_fence) | (concrete_df[col] >= upper_fence)
    number_of_outliers = concrete_df.loc[beyond_fences, col].count()
    print("Number of Outliers data in", col, ": " + str(number_of_outliers))
# Quartiles of every column, then a skewness and an outlier report per column.
df_quant = df.quantile([.25, .50, .75])
for column in df.columns:
    findSkewness(df_quant[column], column)
print('**********************************************************************')
for column in df.columns:
    findOutliers(df_quant[column], column)
Note: None of the attributes are categorical, hence no categorical transformations are required.
Points to note:
# Now let's plot all continous variables
def univariant_plot(number_of_attrs):
    """Box plot, histogram and KDE plot for each of the first *number_of_attrs* columns of df."""
    fig, axes = plt.subplots(number_of_attrs, 3, figsize=(20, 35))
    # Column index doubles as the subplot row index.
    for idx in range(number_of_attrs):
        series = df.iloc[:, idx]
        label = df.columns[idx]
        sns.boxplot(series, ax=axes[idx, 0])
        sns.distplot(series, ax=axes[idx, 1], axlabel=label, kde=False)
        sns.distplot(series, ax=axes[idx, 2], axlabel=label)
    plt.show()
univariant_plot(9)
Inferences:
For example:
Observations on Outliers:
Let's examine the percentage of outliers for each individual attribute.
# Percentage of records equal to 0 in each attribute (ash, slag, superplastic, ...).
for col in df.columns.tolist():
    zero_count = df[df[col] == 0].shape[0]
    # Bug fix: the ratio must be multiplied by 100 before being printed as a
    # percentage (the narrative below quotes 46%/55%/37% for slag/ash/superplastic).
    percentage_of_0s = zero_count / df.shape[0] * 100
    print (col + ' has ' + str(percentage_of_0s) + ' % of 0s')
Final Inferences on univarient analysis:
Number of records with '0' present for slag(46%), ash(55%), superplastic(37%). Hence these records for these attributes can be missing values.
Note: Bivarient plots w.r.to target (strength) attribute
# Scatterplot w.r.to target variable
number_of_attrs = 8
def bivariant_plot():
    """Hex joint plot plus scatter/KDE of each independent attribute against strength.

    Fixes: pd.Index.get_values() was removed in pandas 1.0 (plain indexing is
    used instead), and the scatter/KDE pair sat outside the loop, so only the
    last attribute was ever drawn.
    """
    for row in range(number_of_attrs):
        sns.jointplot(df.iloc[:, row], df.iloc[:, 8], kind='hex')
        plt.show()
        fig, axs = plt.subplots(1, 2, figsize=(10, 5))
        sns.scatterplot(x=df.columns[row], y="strength", data=df, ax=axs[0])
        sns.kdeplot(df[df.columns[row]], df.strength, cmap="Reds", shade=True, shade_lowest=False, ax=axs[1])
        plt.show()
bivariant_plot()
Observations:
# Pairplot, plotting individual attribute w.r.to target
# (all pairwise scatter plots; the diagonal shows each attribute's own distribution)
sns.pairplot(df)
Observations: Not much observation from the above pairplot - in general
Analysis of each attribute:
# KDE joint plot of each independent attribute (columns 0-7) against strength (column 8).
for attr_idx in range(8):
    sns.jointplot(df.iloc[:, attr_idx], df.iloc[:, 8], kind='kde')
    plt.show()
# corrlation matrix
cor = df.corr()
cor
# heatmap
sns.set(font_scale=1.15)
plt.figure(figsize=(15, 10))
sns.heatmap(cor, vmax=.8, linewidths=0.01, square=True, annot=True,
            cmap="BuPu", linecolor="blue")
plt.title('Correlation between attributes');
Observations:
As none of the correlation is beyond 50% and between 20-30% with +ve/-ve correlation. we cannot drop any of the attributes at present with this analysis, and proceed with existing dataset.
1. Give data analysis:
- There are 8 independent attributes ( cement, slag, ash, water, superplastic, coarseagg, fineagg, age) and one dependent variable (strength).
- All the records are numeric.
- There are no missing values in any of the columns.
- All attributes are continuous type.
- All attributes are float type, except age (age is of integer type).
- None of the attribute has any negative values.
- There are 1030 records in the dataframe.
- No string present for any of the attribute values.
2. Descriptive statistics analysis:
- cement, water, coarseagg, fineagg, strength: There is not much difference in mean and median (50%).
- Attributes: slag and ash, age, there is much difference in mean and 50% values, indicating mean > median, so being a right skewness in data.
- Minimum values of slag, ash and superplastic are '0', hoping these values cannot be accepted in estimating the concrete strength.
- Min value should be less than std, however 'cement' has 'min' and 'std' are almost same. And for 'water, coarseagg, fineagg' std is much lesser than 'min. Hence these attributes needs to be deep-dived.
3. Univariant visualization:
- Looks like there are multiple gaussians present in some of the attributes.
- For example: § slag, superplasting, courseagg and age has intersecting gaussians. And also there is 37% of data from superplastic, and 46% of data from slag are '0's. § ash: we can visualize 2 gausians, they both can be at 50 value. And also 55% of the records contain '0' values.
- Strength and cement are almost normally distributed, however not a perfect normal distribution.
- outliers present for age, superplastic, slag, fineagg, water, strength
4. Outliers:
- Huge number of outliers present for age and superplastic attributes.
- ash has minimal level of outliers, where as slag has very minimum outliers.
1. Each Attribute distribution:
- cement is almost normal, with a little right-skewness
- Assuming slag has 2 and rightly skewed.
- ash has 2 gaussians (i.e. 2 clusters), looks like independent to each other and rightly skewed. 1 cluster with '0' values. If we consider '0' ash values are missing values, then ash dataset is with only one cluster.
- water has 3 guassians and slighly left skewed.
- superplastic has 2 gaussians which are intersecting and rightly skewed.
- coarseagg has 3 guassians and almost normal.
- fineagg almost looks normal.
- age has multiple guassians and rightly skewed.
- stregth is almost normal.
- Number of records with '0' present for slag(46%), ash(55%), superplastic(37%). Hence these records for these attributes can be missing values.
2. Correlation:
- cement: this has negative correlation with slag, ash, and fineagg. And 49.7% positive correlation with target (strength). Most of the values located around cement values 100-400 range
- slag: this has negative correlation with ash, coarseagg, fineagg. Little positive correlation (13%) with target (strength).
- ash: negative correlation with water. positive correlation with superplastic. Little negative correlation (10%) with target (strength)
- water: strong negative correlation with superplastic, fineagg. ~27% positive correlation with age. Negative correlation with target (strength).
- superplastic: negative correlation with coarseagg. And positive correlation almst 22% and 36% with fineagg and target (strength)
- coarseagg: very light -ve correlation with fineagg and strength.
- fineagg: very less negative correlation with age and strength.
- age: 33% positive correlation with strength.
# Regression plots of the correlated pairs of independent variables:
#   cement       Vs {slag, ash, fineagg}
#   slag         Vs {ash, coarseagg, fineagg}
#   ash          Vs {water, superplastic}
#   water        Vs {superplastic, fineagg}
#   superplastic Vs coarseagg
#   coarseagg    Vs fineagg
#   fineagg      Vs age
indep_attr_dict = {
    'cement': ['slag', 'ash', 'fineagg'],
    'slag' : ['ash', 'coarseagg', 'fineagg'],
    'ash' : ['water', 'superplastic'],
    'water' : ['superplastic', 'fineagg'],
    'superplastic' : ['coarseagg'],
    'coarseagg' : ['fineagg'],
    'fineagg' : ['age']
}
df = concrete_df.copy()
for attr, partners in indep_attr_dict.items():
    if len(partners) > 1:
        fig, axes = plt.subplots(1, len(partners), figsize=(10, 5), sharey=True)
        for pos, partner in enumerate(partners):
            sns.regplot(x=df[attr], y=df[partner], ax=axes[pos])
    else:
        sns.regplot(x=df[attr], y=df[partners[0]])
    plt.show()
Observations:
# Side-by-side box plots of every attribute to compare spreads and outliers.
df.boxplot(figsize=(35,15))
Observations:
# age is integer type, which can be grouped.
tmp_df = concrete_df.copy()
for grouped_col in ('age', 'fineagg'):
    print (tmp_df[grouped_col].value_counts())
#Checking for missing values
df.isnull().sum()
#Replacing the outliers by median
# For every independent column, values beyond the 1.5*IQR fences are
# overwritten with that column's median; the target column is left untouched.
filled_outliers_df = concrete_df.copy()
for col in filled_outliers_df.columns[:-1]:
    quartiles = filled_outliers_df[col].quantile([0.25, 0.75])
    q1, q3 = quartiles[0.25], quartiles[0.75]
    fence = 1.5 * (q3 - q1)
    low, high = q1 - fence, q3 + fence
    out_of_range = (filled_outliers_df[col] < low) | (filled_outliers_df[col] > high)
    filled_outliers_df.loc[out_of_range, col] = filled_outliers_df[col].median()
filled_outliers_df.boxplot(figsize=(35,15))
Observations:
age number of records
365 14
270 13
360 6
# Distinct 'age' values remaining after the outlier replacement above.
filled_outliers_df['age'].value_counts()
# Pairwise relationships on the outlier-filled data.
sns.pairplot(filled_outliers_df)
However still clusters did not get reduced. Neither strong correlation among independent attributes, nor between independent attribute Vs target (strength).
# Divide the data into X and y datasets (Independent attributes --> X, Dependent attribute --> y)
df = concrete_df.copy()
X = df.drop('strength', axis=1)
y = df[['strength']]
# Standardise every column (zero mean, unit variance).
X_scaled = pd.DataFrame(preprocessing.scale(X), columns=X.columns)
y_scaled = pd.DataFrame(preprocessing.scale(y), columns=y.columns)
print ('X_scaled: ')
print (X_scaled)
print ('y_scaled: ')
print (y_scaled)
# Split data into train and test (70:30) and fit a plain linear model.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.30, random_state=1)
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
# Report the fitted coefficient of every feature, then the intercept.
for position, feature in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(feature, regression_model.coef_[0][position]))
intercept = regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))
y_pred = regression_model.predict(X_test)
print('Performance on training data using Linear Regression:',regression_model.score(X_train, y_train))
print('Performance on testing data using Linear Regression:',regression_model.score(X_test, y_test))
acc = metrics.r2_score(y_test, y_pred)
print('Accuracy: ',acc)
print('MSE: ',metrics.mean_squared_error(y_test, y_pred))
# Regression plot between y_test and y_pred (Test Vs Predicted) values
with sns.axes_style("darkgrid"):
    joint = sns.jointplot(x=y_test, y=y_pred, kind='reg', color='blue')
    joint.annotate(stat.pearsonr)
    plt.show()
# Seed the running comparison table with this first model's accuracy.
results = pd.DataFrame({'Method': ['Linear Regression'], 'accuracy': acc}, index={'1'})
results = results[['Method', 'accuracy']]
results
# Ridge regression (L2 shrinkage) on the same train/test split.
ridge = Ridge(alpha=.3)
ridge.fit(X_train,y_train)
for position, feature in enumerate(X_train.columns):
    print("Ridge model coefficient for {} is {}".format(feature, ridge.coef_[0][position]))
y_pred = ridge.predict(X_test)
print('Performance on training data using Ridge Regression:',ridge.score(X_train, y_train))
print('Performance on testing data using Ridge Regression:',ridge.score(X_test, y_test))
acc = metrics.r2_score(y_test, y_pred)
print('Accuracy: ',acc)
print('MSE: ',metrics.mean_squared_error(y_test, y_pred))
# Regression plot between y_test and y_pred (Test Vs Predicted) values
with sns.axes_style("darkgrid"):
    joint = sns.jointplot(x=y_test, y=y_pred, kind='reg', color='blue')
    joint.annotate(stat.pearsonr)
    plt.show()
# Append this model's accuracy to the comparison table.
ridge_row = pd.DataFrame({'Method': ['Ridge Regression'], 'accuracy': [acc]}, index={'2'})
results = pd.concat([results, ridge_row])[['Method', 'accuracy']]
results
Observation:
Let's check the same using Lasso, where Lasso drops the least significant attributes.
# Lasso regression (L1 shrinkage) on the same train/test split.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
# Bug fix: the original iterated "for col_name in enumerate(...)", which makes
# col_name an (index, name) tuple, so the printed feature names were tuples.
for idx, col_name in enumerate(X_train.columns):
    print("Lasso model coefficient for {} is {}".format(col_name, lasso.coef_[idx]))
y_pred = lasso.predict(X_test)
print('Performance on training data using Lasso Regression:',lasso.score(X_train, y_train))
print('Performance on testing data using Lasso Regression:',lasso.score(X_test, y_test))
acc = metrics.r2_score(y_test, y_pred)
print('Accuracy: ',acc)
print('MSE: ',metrics.mean_squared_error(y_test, y_pred))
# Lasso predicts a 1-D array; reshape to a column for the joint plot.
y_pred = np.reshape(y_pred, (-1, 1))
# Regression plot between y_test and y_pred (Test Vs Predicted) values
with sns.axes_style("darkgrid"):
    j = sns.jointplot(x=y_test, y=y_pred, kind='reg', color='blue')
    j.annotate(stat.pearsonr)
    plt.show()
#Store the accuracy results for each model in a dataframe for final comparison
results = pd.concat([results, pd.DataFrame({'Method':['Lasso Regression'], 'accuracy': [acc]},index={'3'})])
results = results[['Method', 'accuracy']]
results
Observations:
# Expand the scaled features with degree-2 interaction terms, then refit the linear model.
poly = PolynomialFeatures(degree = 2, interaction_only=True)
X_poly = poly.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.30, random_state=1)
regression_model.fit(X_train, y_train)
print(regression_model.coef_[0])
y_pred = regression_model.predict(X_test)
print('Performance on training data using Linear Regression with Polynomial:',regression_model.score(X_train, y_train))
print('Performance on testing data using Linear Regression with Polynomial:',regression_model.score(X_test, y_test))
acc = metrics.r2_score(y_test, y_pred)
print('Accuracy: ',acc)
print('MSE: ',metrics.mean_squared_error(y_test, y_pred))
# Regression plot between y_test and y_pred (Test Vs Predicted) values
with sns.axes_style("darkgrid"):
    joint = sns.jointplot(x=y_test, y=y_pred, kind='reg', color='blue')
    joint.annotate(stat.pearsonr)
    plt.show()
# Append this model's accuracy to the comparison table.
poly_row = pd.DataFrame({'Method': ['Linear Regression with Polynomial'], 'accuracy': [acc]}, index={'4'})
results = pd.concat([results, poly_row])[['Method', 'accuracy']]
results
Observation:
# Ridge on the polynomial feature set.
ridge = Ridge(alpha=.3)
ridge.fit(X_train,y_train)
print ("Ridge model Coefficients:", (ridge.coef_))
y_pred = ridge.predict(X_test)
print('Performance on training data using Ridge Regression with Polynomial:',ridge.score(X_train, y_train))
print('Performance on testing data using Ridge Regression with Polynomial:',ridge.score(X_test, y_test))
acc = metrics.r2_score(y_test, y_pred)
print('Accuracy: ',acc)
print('MSE: ',metrics.mean_squared_error(y_test, y_pred))
# Regression plot between y_test and y_pred (Test Vs Predicted) values
with sns.axes_style("darkgrid"):
    joint = sns.jointplot(x=y_test, y=y_pred, kind='reg', color='blue')
    joint.annotate(stat.pearsonr)
    plt.show()
# Append this model's accuracy to the comparison table.
ridge_poly_row = pd.DataFrame({'Method': ['Ridge Regression with Polynomial'], 'accuracy': [acc]}, index={'5'})
results = pd.concat([results, ridge_poly_row])[['Method', 'accuracy']]
results
# Lasso on the polynomial feature set.
lasso = Lasso(alpha=0.01)
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))
y_pred = lasso.predict(X_test)
print('Performance on training data using Lasso Regression with Polynomial:',lasso.score(X_train, y_train))
print('Performance on testing data using Lasso Regression with Polynomial:',lasso.score(X_test, y_test))
acc = metrics.r2_score(y_test, y_pred)
print('Accuracy: ',acc)
print('MSE: ',metrics.mean_squared_error(y_test, y_pred))
# Lasso predicts a 1-D array; reshape to a column for the joint plot.
y_pred = np.reshape(y_pred, (-1, 1))
# Regression plot between y_test and y_pred (Test Vs Predicted) values
with sns.axes_style("darkgrid"):
    joint = sns.jointplot(x=y_test, y=y_pred, kind='reg', color='blue')
    joint.annotate(stat.pearsonr)
    plt.show()
# Append this model's accuracy to the comparison table.
lasso_poly_row = pd.DataFrame({'Method': ['Lasso Regression with Polynomial'], 'accuracy': [acc]}, index={'6'})
results = pd.concat([results, lasso_poly_row])[['Method', 'accuracy']]
results
Note:
Note:
Except age, all other attributes units are different. Hence all attributes has to be placed in same units. Hence using Z score for scaling.
# Scaling the dataset using Z-score to keep all attributes units to same
filled_outliers_df_z = pd.DataFrame(filled_outliers_df.apply(zscore),
                                    columns=concrete_df.columns)
dtree_model = DecisionTreeRegressor()
def build_dtree_model(df_z, attr_tobe_considered):
    """Fit the module-level DecisionTreeRegressor on *df_z* and report scores.

    attr_tobe_considered: 'all' -> use all 8 independent columns;
    any other value -> keep only columns [0, 1, 2, 3, 7].
    Returns the R^2 score on the 30% hold-out set.
    """
    # Step: 1 - Separating independent and dependent attributes
    if (attr_tobe_considered == 'all'):
        X = df_z.iloc[:,0:8]
    else:
        X = df_z.iloc[:, [0, 1, 2, 3, 7]]
    # Bug fix: take the target from the frame that was passed in, not from the
    # global filled_outliers_df_z, so X and y always come from the same data.
    y = df_z.iloc[:,8]
    # Step: 2 - Split X and y into training and test set in 70:30 ratio
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 1)
    # Step: 3 - Building model
    dtree_model.fit(X_train , y_train)
    # printing the feature importance
    print('Feature importances: \n',pd.DataFrame(dtree_model.feature_importances_,columns=['Imp'],index=X_train.columns))
    print ('*****************************************************************')
    # Step: 4 - Performance computation
    y_pred = dtree_model.predict(X_test)
    print(X_test)
    print('Performance on training data using DT:',dtree_model.score(X_train, y_train))
    print('Performance on testing data using DT:',dtree_model.score(X_test, y_test))
    acc_DT = metrics.r2_score(y_test, y_pred)
    print('Accuracy DT: ',acc_DT)
    print('MSE: ',metrics.mean_squared_error(y_test, y_pred))
    # Step: 5 - Regression plot between y_test and y_pred (Test Vs Predicted) values
    with sns.axes_style("darkgrid"):
        j = sns.jointplot(x=y_test, y=y_pred, kind='reg', color='blue')
        j.annotate(stat.pearsonr)
        plt.show()
    return acc_DT
acc = build_dtree_model(filled_outliers_df_z, 'all')
There is a huge difference in performance between training and testing, which indicates overfitting.
#Store the accuracy results for each model in a dataframe for final comparison
# (row key '7' continues the running index of the results table)
results = pd.concat([results, pd.DataFrame({'Method':['Decision Tree'], 'accuracy': [acc]},index={'7'})])
results = results[['Method', 'accuracy']]
results
Observations:
def kfold_cross_validation(method, indx, results, dtree_model):
    """10-fold cross-validate *dtree_model* and append its mean score to *results*.

    method: label for the results row; indx: row key for the results table.
    NOTE(review): X and y are read from module-level globals, not parameters —
    confirm they hold the intended dataset at each call site.
    """
    num_folds = 10
    seed = 77
    # Bug fix: scikit-learn >= 0.24 raises ValueError when random_state is set
    # while shuffle is False; shuffle explicitly so the seed is meaningful.
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    fold_scores = cross_val_score(dtree_model, X, y, cv=kfold)
    accuracy = np.mean(abs(fold_scores))
    print('Kfold Average accuracy: ',accuracy)
    print('Kfold Standard Deviation: ',fold_scores.std())
    results = pd.concat([results, pd.DataFrame({'Method':[method], 'accuracy': [accuracy]},index={indx})])
    results = results[['Method', 'accuracy']]
    return results
results = kfold_cross_validation('Decision Tree K fold','8', results, dtree_model)
results
# Re-run the tree with the least important attributes dropped.
df_z_2 = filled_outliers_df_z.copy()
accuracy = build_dtree_model(df_z_2, 'dropping_attr')
drop_row = pd.DataFrame({'Method': 'Decision Tree drop least attr', 'accuracy': [accuracy]}, index={'9'})
results = pd.concat([results, drop_row])[['Method', 'accuracy']]
results
There is no improvement in accuracy from training to testing data. Still overfit model.
# independent and dependent variables
X = filled_outliers_df_z.iloc[:,0:8]
y = filled_outliers_df_z.iloc[:,8]
# Split X and y into training and test set in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 1)
# Regularizing the Decision tree regressor and fitting the model
reg_dt_model = DecisionTreeRegressor( max_depth = 11,random_state=1,min_samples_leaf=5)
reg_dt_model.fit(X_train, y_train)
print (pd.DataFrame(reg_dt_model.feature_importances_, columns = ["Imp"], index = X_train.columns))
# View a list of the features and their importance scores
tmp_df = concrete_df.copy()
importances = reg_dt_model.feature_importances_
indices = np.argsort(importances)[::-1][:15]
a = tmp_df.columns[:]
print (a)
# Bug fix: pd.Index.drop has no axis argument — the original's second
# positional "1" was silently absorbed by the errors parameter.
features = a.drop('strength')
#plot it
plt.figure(1)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
Observations:
# Evaluate the pruned tree on the hold-out set, then cross-validate it.
predicted = reg_dt_model.predict(X_test)
print('Performance on training data using DT:',reg_dt_model.score(X_train,y_train))
print('Performance on testing data using DT:',reg_dt_model.score(X_test,y_test))
acc_RDT = metrics.r2_score(y_test, predicted)
print('Accuracy DT: ',acc_RDT)
print('MSE: ',metrics.mean_squared_error(y_test, predicted))
pruned_row = pd.DataFrame({'Method': 'Pruned Decision Tree', 'accuracy': [acc_RDT]}, index={'10'})
results = pd.concat([results, pruned_row])[['Method', 'accuracy']]
results
results = kfold_cross_validation('Pruned Decision Tree K fold', '11', results, reg_dt_model)
results
# Same pruning experiment, keeping only the more important attributes.
X = filled_outliers_df_z.iloc[:, [0, 1, 2, 3, 7]]
y = filled_outliers_df_z.iloc[:,8]
# Split X and y into training and test set in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# Regularizing the Decision tree regressor and fitting the model
reg_dt_model = DecisionTreeRegressor(max_depth=4, random_state=1, min_samples_leaf=5)
reg_dt_model.fit(X_train, y_train)
predicted = reg_dt_model.predict(X_test)
print('Performance on training data using DT:',reg_dt_model.score(X_train,y_train))
print('Performance on testing data using DT:',reg_dt_model.score(X_test,y_test))
acc_RDT = metrics.r2_score(y_test, predicted)
print('Accuracy DT: ',acc_RDT)
print('MSE: ',metrics.mean_squared_error(y_test, predicted))
drop_attr_row = pd.DataFrame({'Method': 'Pruned Decision Tree drop least attr', 'accuracy': [acc_RDT]}, index={'12'})
results = pd.concat([results, drop_attr_row])[['Method', 'accuracy']]
results
# Fit KMeans for k = 1..14 and record each fit's inertia (within-cluster SSE).
cluster_range = range( 1, 15 )
cluster_errors = []
df = concrete_df.copy()
for num_clusters in cluster_range:
    kmeans_model = KMeans( num_clusters, n_init = 5)
    kmeans_model.fit(df)
    labels = kmeans_model.labels_
    centroids = kmeans_model.cluster_centers_
    cluster_errors.append(kmeans_model.inertia_)
clusters_df = pd.DataFrame( { "num_clusters":cluster_range, "cluster_errors": cluster_errors } )
clusters_df[0:15]
# Elbow plot
plt.figure(figsize=(12,6))
plt.plot( clusters_df.num_clusters, clusters_df.cluster_errors, marker = "o" )
Observations:
# Re-standardise the outlier-filled data and partition it into k=4 clusters.
df = filled_outliers_df.copy()
concrete_df_z = df.apply(zscore)
cluster = KMeans( n_clusters = 4, random_state = 2354 )
cluster.fit(concrete_df_z)
# Creating a new column "GROUP" which will hold the cluster id of each record
concrete_df_z["GROUP"] = cluster.predict(concrete_df_z)
# Creating a mirror copy for later re-use instead of building repeatedly
concrete_df_z_copy = concrete_df_z.copy(deep = True)
centroids = cluster.cluster_centers_
centroids
centroid_df = pd.DataFrame(centroids, columns = list(df) )
centroid_df
# Instead of raw centroid numbers, draw a box plot of the data per cluster.
concrete_df_z.boxplot(by = 'GROUP', layout=(3,3), figsize=(15, 10))
From the previous analysis, among all attributes 'cement' is good predictor. Hence let's visualize clusters within 'cement' attribute.
var = 'cement'
with sns.axes_style("white"):
    # Fix: seaborn 0.12 removed positional x/y for lmplot; pass them as keywords.
    plot = sns.lmplot(x=var, y='strength', data=concrete_df_z, hue='GROUP')
    plot.set(ylim = (-3,3))
Observation: The bodies of the 4 clusters are overlapping. Hence we cannot separate the data into multiple clusters using KMeans, and we proceed with PCA analysis on these 4 groups.
# Kmeans clustering is not helping at all. Let's try PRINCIPAL COMPONENT ANALYSIS
# Apply PCA for each group
group = [concrete_df_z[concrete_df_z['GROUP'] == g] for g in range(4)]
def pca_analysis(indx):
    """Manual PCA of cluster *indx*: plot per-component and cumulative explained variance."""
    independent_attr = group[indx].drop(['strength', 'GROUP'] , axis = 1)
    # NOTE: DataFrame.pop mutates the stored group frame (drops 'strength' in place),
    # matching the original behaviour.
    dependent_attr = np.array(group[indx].pop('strength'))
    # The data is already z-scored, so PCA is done by hand rather than with
    # sklearn's PCA (which would re-standardise the inputs).
    cov_matrix = np.cov(independent_attr, rowvar=False)
    # Fix: np.linalg.eig was previously called twice (first result discarded),
    # and the eig_pairs sort machinery was dead code that could even raise when
    # comparing eigenvectors on tied eigenvalues. One decomposition suffices.
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
    tot = sum(eigenvalues)
    # Variance explained by each component, in descending order.
    var_explained = [(i / tot) for i in sorted(eigenvalues, reverse=True)]
    # Cumulative variance; the 8th entry reaches ~100%.
    cum_var_exp = np.cumsum(var_explained)
    print ('cum_var_exp: ', cum_var_exp)
    plt_title = 'GROUP - ' + str(indx+1)
    plt.bar(range(0, 8), var_explained, alpha=0.5, align='center', label='individual explained variance')
    plt.step(range(0,8),cum_var_exp, where= 'mid', label='cumulative explained variance')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.title(plt_title)
    plt.legend(loc = 'best')
    plt.show()
for indx in range(4):
    pca_analysis(indx)
# Now apply and see whether DT with kfold improves the performance
# (a fresh, unpruned tree cross-validated on the module-level X / y)
dtree_model = DecisionTreeRegressor()
results = kfold_cross_validation('Decision Tree K fold with PCA','13', results, dtree_model)
results
There is no improvement in accuracy; in fact, the 'DT with PCA' accuracy score dropped compared to 'DT with kfold'.
# Random forest on the z-scored, outlier-filled data.
df = filled_outliers_df.copy()
concrete_df_z = df.apply(zscore)
X = concrete_df_z.iloc[:,0:8]
y = concrete_df_z.iloc[:,8]
# Split X and y into training and test set in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
model = RandomForestRegressor()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
print('Performance on training data using RFR:',model.score(X_train,y_train))
print('Performance on testing data using RFR:',model.score(X_test,y_test))
acc_RFR = metrics.r2_score(y_test, predicted)
print('Accuracy DT: ',acc_RFR)
print('MSE: ',metrics.mean_squared_error(y_test, predicted))
rfr_row = pd.DataFrame({'Method': 'Random Forest Regressor', 'accuracy': [acc_RFR]}, index={'14'})
results = pd.concat([results, rfr_row])[['Method', 'accuracy']]
results
Observation:
There is an improvement in accuracy score using Random Forest Regressor algorithm.
# Cross-validated score of the fitted random forest, appended as row '15'.
results = kfold_cross_validation('Random Forest Regressor with Kfold', '15', results, model)
results
# Gradient Boosting Regressor on the same split, plus its cross-validated score.
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
print('Performance on training data using GBR:',model.score(X_train,y_train))
print('Performance on testing data using GBR:',model.score(X_test,y_test))
acc_GBR = metrics.r2_score(y_test, predicted)
print('Accuracy DT: ',acc_GBR)
print('MSE: ',metrics.mean_squared_error(y_test, predicted))
gbr_row = pd.DataFrame({'Method': 'Gradient Boosting Regressor', 'accuracy': [acc_GBR]}, index={'16'})
results = pd.concat([results, gbr_row])[['Method', 'accuracy']]
results
results = kfold_cross_validation('Gradient Boosting Regressor with kfold', '17', results, model)
results
# AdaBoost Regressor on the same split, plus its cross-validated score.
model = AdaBoostRegressor()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
print('Performance on training data using ABR:',model.score(X_train,y_train))
print('Performance on testing data using ABR:',model.score(X_test,y_test))
acc_ABR = metrics.r2_score(y_test, predicted)
print('Accuracy DT: ',acc_ABR)
print('MSE: ',metrics.mean_squared_error(y_test, predicted))
abr_row = pd.DataFrame({'Method': 'Ada Boosting Regressor', 'accuracy': [acc_ABR]}, index={'18'})
results = pd.concat([results, abr_row])[['Method', 'accuracy']]
results
results = kfold_cross_validation('Ada Boosting Regressor with kfold', '19', results, model)
results
# Bagging Regressor on the same split, plus its cross-validated score.
model = BaggingRegressor()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
print('Performance on training data using GBR:',model.score(X_train,y_train))
print('Performance on testing data using GBR:',model.score(X_test,y_test))
acc_BR = metrics.r2_score(y_test, predicted)
print('Accuracy DT: ',acc_BR)
print('MSE: ',metrics.mean_squared_error(y_test, predicted))
bagging_row = pd.DataFrame({'Method': 'Bagging Regressor', 'accuracy': [acc_BR]}, index={'20'})
results = pd.concat([results, bagging_row])[['Method', 'accuracy']]
results
results = kfold_cross_validation('Bagging Regressor with kfold', '21', results, model)
results
# Error curve over k for KNN regression.
error=[]
for i in range(1,30):
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    # Bug fix: for a *regressor*, np.mean(pred_i != y_test) is a float
    # inequality that is almost always 1.0, producing a flat, useless curve.
    # Track the mean absolute error instead, matching the 'Mean error' label.
    error.append(np.mean(np.abs(pred_i - y_test)))
plt.figure(figsize=(12,6))
plt.plot(range(1,30),error,color='red', linestyle='dashed',marker='o',markerfacecolor='blue',markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean error')
#k=3
# KNN regressor with k=3 (value selected from the elbow plot above).
model = KNeighborsRegressor(n_neighbors=3)
model.fit(X_train, y_train)
knn_pred = model.predict(X_test)
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)
print('Performance on training data using KNNR:', train_r2)
print('Performance on testing data using KNNR:', test_r2)
# R^2 on the held-out predictions is reported as the model's "accuracy".
acc_K = metrics.r2_score(y_test, knn_pred)
print('Accuracy KNNR: ', acc_K)
print('MSE: ', metrics.mean_squared_error(y_test, knn_pred))
knn_row = pd.DataFrame({'Method': 'KNN Regressor', 'accuracy': [acc_K]}, index={'22'})
results = pd.concat([results, knn_row])
results = results[['Method', 'accuracy']]
results
results = kfold_cross_validation('KNN Regressor with kfold', '23', results, model)
results
# Support Vector Regressor with a linear kernel, evaluated on both splits.
model = SVR(kernel='linear')
model.fit(X_train, y_train)
svr_pred = model.predict(X_test)
# performance on train data
print('Performance on training data using SVR:', model.score(X_train, y_train))
# performance on test data
print('Performance on testing data using SVR:', model.score(X_test, y_test))
# R^2 on the held-out predictions is reported as the model's "accuracy".
acc_S = metrics.r2_score(y_test, svr_pred)
print('Accuracy SVR: ', acc_S)
print('MSE: ', metrics.mean_squared_error(y_test, svr_pred))
svr_row = pd.DataFrame({'Method': 'SVR', 'accuracy': [acc_S]}, index={'24'})
results = pd.concat([results, svr_row])
results = results[['Method', 'accuracy']]
results
results = kfold_cross_validation('SVR with kfold', '25', results, model)
results
# Voting ensemble averaging a linear model, a KNN regressor and a linear SVM.
LR = LinearRegression()
KN = KNeighborsRegressor(n_neighbors=3)
SVM = svm.SVR(kernel='linear')
evc = VotingRegressor(estimators=[('LR', LR), ('KN', KN), ('SVM', SVM)])
evc.fit(X_train, y_train)
vote_pred = evc.predict(X_test)
print('Performance on training data using ensemble:', evc.score(X_train, y_train))
print('Performance on testing data using ensemble:', evc.score(X_test, y_test))
# R^2 on the held-out predictions is reported as the ensemble's "accuracy".
acc_E = metrics.r2_score(y_test, vote_pred)
print('Accuracy ensemble: ', acc_E)
print('MSE: ', metrics.mean_squared_error(y_test, vote_pred))
vote_row = pd.DataFrame({'Method': 'Ensemeble with voting', 'accuracy': [acc_E]}, index={'26'})
results = pd.concat([results, vote_row])
results = results[['Method', 'accuracy']]
results
results = kfold_cross_validation('Ensemeble with voting with kfold', '27', results, evc)
results
Inferences drawn by comparing the accuracy of all the models:
# Bootstrap evaluation of a Gradient Boosting Regressor: fit on a sampled-with-
# replacement training set, score on the out-of-bag rows, repeat n_iterations
# times to build a distribution of R^2 scores.
concrete_XY = X.join(y)
values = concrete_XY.values
# Number of bootstrap samples to create
n_iterations = 1000
# size of a bootstrap sample (same size as the full data set)
n_size = int(len(concrete_df_z) * 1)
# empty list that will hold the score for each bootstrap iteration
stats = list()
for i in range(n_iterations):
    # prepare train and test sets: sampling with replacement for training
    train = resample(values, n_samples=n_size)
    # Out-of-bag rows (not drawn into the sample) form the test set.
    # PERF FIX: a set of row-tuples gives O(1) membership tests; the original
    # `x.tolist() not in train.tolist()` rebuilt the list and scanned it for
    # every row, i.e. O(n^2) work per iteration.
    train_rows = {tuple(row) for row in train}
    test = np.array([x for x in values if tuple(x) not in train_rows])
    # fit against independent variables and corresponding target values
    gbmTree = GradientBoostingRegressor(n_estimators=50)
    gbmTree.fit(train[:, :-1], train[:, -1])
    # the last column is the target
    y_test = test[:, -1]
    # R^2 of the model on the out-of-bag data
    score = gbmTree.score(test[:, :-1], y_test)
    stats.append(score)
def plot_score(stats):
    """Display a histogram of the per-iteration bootstrap scores."""
    pyplot.hist(stats)
    pyplot.show()
# Percentile-method confidence interval for the bootstrap score distribution.
alpha = 0.95  # for 95% confidence
# 2.5% tail on each side of the distribution
lower_pct = ((1.0 - alpha) / 2.0) * 100
lower = max(0.0, np.percentile(stats, lower_pct))
upper_pct = (alpha + ((1.0 - alpha) / 2.0)) * 100
upper = min(1.0, np.percentile(stats, upper_pct))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))
plot_score(stats)
# Bootstrap evaluation of a Random Forest Regressor, mirroring the GBM run:
# the forest is constructed once and refitted on each bootstrap sample.
values = concrete_XY.values
# Number of bootstrap samples to create
n_iterations = 1000
# size of a bootstrap sample
n_size = int(len(concrete_df_z) * 1)
rfTree = RandomForestRegressor(n_estimators=100)
# empty list that will hold the score for each bootstrap iteration
stats = list()
for i in range(n_iterations):
    # prepare train and test sets: sampling with replacement for training
    train = resample(values, n_samples=n_size)
    # Out-of-bag rows form the test set.
    # PERF FIX: set-of-tuples membership replaces the original quadratic
    # `x.tolist() not in train.tolist()` scan.
    train_rows = {tuple(row) for row in train}
    test = np.array([x for x in values if tuple(x) not in train_rows])
    # fit against independent variables and corresponding target values
    rfTree.fit(train[:, :-1], train[:, -1])
    # the last column is the target
    y_test = test[:, -1]
    # R^2 of the model on the out-of-bag data
    score = rfTree.score(test[:, :-1], y_test)
    stats.append(score)
plot_score(stats)
The bootstrap Random Forest Regressor's performance lies between 84% and 90.7%, which is better than that of the other regression algorithms.
# Split the data into train (64%), validation (16%) and test (20%) sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
# Step - 1: refit the random forest on the training split only
rfTree.fit(X_train, y_train)
# Step - 2: predict on the validation split
rf_val_pred = rfTree.predict(X_val)
print('Performance on training data using Random Forest Regressor:', rfTree.score(X_train, y_train))
print('Performance on validation data using Random Forest Regressor:', rfTree.score(X_val, y_val))
accuracy = metrics.r2_score(y_val, rf_val_pred)
print('Accuracy: ', accuracy)
print('MSE: ', metrics.mean_squared_error(y_val, rf_val_pred))
Step - 3: Hyperparameter tuning
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
print(rfTree.get_params())
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# FIX 1: reuse the lists defined above (the original re-typed the literals
# inline, leaving the variables dead).
# FIX 2: max_features='auto' was removed in recent scikit-learn; for a
# RandomForestRegressor it meant "use all features", i.e. 1.0.
param_grid = {"max_depth": [3, None],
              "max_features": [1.0, 'sqrt'],
              "min_samples_split": min_samples_split,
              "min_samples_leaf": min_samples_leaf,
              "bootstrap": bootstrap}
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# run randomized search
samples = 10  # number of random parameter settings to sample
randomCV = RandomizedSearchCV(rfTree, param_distributions=param_grid, n_iter=samples)  # default cv
# NOTE(review): fitting the search on all of X, y leaks the test rows into
# tuning; consider fitting on X_train only — confirm intent.
randomCV.fit(X, y)
print(randomCV.best_params_)
randomCV.cv_results_['mean_test_score']
Inference: Tuning the Random Forest Regressor with the Randomized Search CV technique yields a performance of about 88%.
# Computing the performance of the tuned model on train, validation and test sets.
# Fixed 'Randon' typo in the training-split label.
print('Performance on training data using RandomizedSearch CV on Random Forest Regressor:', randomCV.score(X_train, y_train))
print('Performance on validation data RandomizedSearchCV on Random Forest Regressor:', randomCV.score(X_val, y_val))
print('Performance on test data RandomizedSearchCV on Random Forest Regressor:', randomCV.score(X_test, y_test))
y_pred = randomCV.predict(X_test)
accuracy = metrics.r2_score(y_test, y_pred)
print('Accuracy: ', accuracy)
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))
## Performance of the RandomizedSearchCV applied RFR is showing very good results.
## Lets apply kfold
num_folds = 10
seed = 77
# BUG FIX: KFold raises ValueError when random_state is set without
# shuffle=True (scikit-learn >= 0.24).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results1 = cross_val_score(randomCV, X, y, cv=kfold)
accuracy = np.mean(abs(results1))
print('Kfold Average accuracy: ', accuracy)
print('Kfold Standard Deviation: ', results1.std())
results = pd.concat([results, pd.DataFrame({'Method': 'RFR by applying RandomizedSearchCV with kfold', 'accuracy': [accuracy]}, index={'28'})])
results = results[['Method', 'accuracy']]
results
# run exhaustive grid search over the same parameter grid
gridCV = GridSearchCV(rfTree, param_grid=param_grid)
start = time()
gridCV.fit(X, y)
# FIX: report the elapsed fit time — the original recorded `start` but never
# used it.
print('GridSearchCV fit took %.2f seconds' % (time() - start))
gridCV.best_params_
gridCV.cv_results_['mean_test_score']
# Computing the performance of the tuned model on train, validation and test sets.
# Fixed 'Randon' typo in the training-split label.
print('Performance on training data using GridSearch CV on Random Forest Regressor:', gridCV.score(X_train, y_train))
print('Performance on validation data GridSearchCV on Random Forest Regressor:', gridCV.score(X_val, y_val))
print('Performance on test data GridSearchCV on Random Forest Regressor:', gridCV.score(X_test, y_test))
y_pred = gridCV.predict(X_test)
accuracy = metrics.r2_score(y_test, y_pred)
print('Accuracy: ', accuracy)
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))
## Performance of the GridSearchCV applied RFR shows overfitting: it performs very well on the training data,
## and poorly on the validation and test data
## Lets apply kfold
num_folds = 10
seed = 77
# BUG FIX: KFold raises ValueError when random_state is set without
# shuffle=True (scikit-learn >= 0.24).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results1 = cross_val_score(gridCV, X, y, cv=kfold)
accuracy = np.mean(abs(results1))
print('Kfold Average accuracy: ', accuracy)
print('Kfold Standard Deviation: ', results1.std())
# BUG FIX: index '29' — the original reused '28', duplicating the label of the
# RandomizedSearchCV row appended just above.
results = pd.concat([results, pd.DataFrame({'Method': 'RFR by applying GridSearchCV with kfold', 'accuracy': [accuracy]}, index={'29'})])
results = results[['Method', 'accuracy']]
results
Hence my recommendation is that the "Random Forest Regressor" model is best suited for predicting the 'strength' attribute from the concrete data's independent attributes.